1 基于iris数据的示例
1.1 报告描述性统计量
- User提问: 在R中报告iris中各个变量的均值、中位数、标准差、最小值和最大值。
# 加载iris数据集
data(iris)
# 计算均值
mean_values <- round(colMeans(iris[, 1:4]), 3)
# 计算中位数
median_values <- sapply(iris[, 1:4], median)
# 计算标准差
sd_values <- round(sapply(iris[, 1:4], sd), 3)
# 计算最小值
min_values <- sapply(iris[, 1:4], min)
# 计算最大值
max_values <- sapply(iris[, 1:4], max)
# 创建一个数据框包含统计量
statistics <- data.frame(
Variable = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
Mean = mean_values,
Median = median_values,
SD = sd_values,
Min = min_values,
Max = max_values
)
# 打印报告
print(statistics) Variable Mean Median SD Min Max
Sepal.Length Sepal.Length 5.843 5.80 0.828 4.3 7.9
Sepal.Width Sepal.Width 3.057 3.00 0.436 2.0 4.4
Petal.Length Petal.Length 3.758 4.35 1.765 1.0 6.9
Petal.Width Petal.Width 1.199 1.30 0.762 0.1 2.5
- User:
结果中最小值和最大值都只有1位小数,如何修改?
# 加载iris数据集
data(iris)
# 计算均值
mean_values <- format(round(colMeans(iris[, 1:4]), 3), nsmall = 3)
# 计算中位数
median_values <- format(round(sapply(iris[, 1:4], median), 3), nsmall = 3)
# 计算标准差
sd_values <- format(round(sapply(iris[, 1:4], sd), 3), nsmall = 3)
# 计算最小值
min_values <- format(round(sapply(iris[, 1:4], min), 3), nsmall = 3)
# 计算最大值
max_values <- format(round(sapply(iris[, 1:4], max), 3), nsmall = 3)
# 创建一个数据框包含统计量
statistics <- data.frame(
Variable = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
Mean = mean_values,
Median = median_values,
SD = sd_values,
Min = min_values,
Max = max_values
)
# 移除Variable列名
rownames(statistics) <- NULL
# 打印报告
print(statistics) Variable Mean Median SD Min Max
1 Sepal.Length 5.843 5.800 0.828 4.300 7.900
2 Sepal.Width 3.057 3.000 0.436 2.000 4.400
3 Petal.Length 3.758 4.350 1.765 1.000 6.900
4 Petal.Width 1.199 1.300 0.762 0.100 2.500
- User: 如何将数据框导出到word文档?
library(officer)
# 创建一个空白的Word文档
doc <- read_docx()
# 添加数据框
table <- as.data.frame(statistics)
doc <- body_add_table(doc, table)
# 保存Word文档
print(doc, target = "iris_table.docx")描述性统计分析导出到Word文档
- User:
按Species分组,报告各组均值,结果保留3位小数?
# 加载iris数据集
data(iris)
# 按Species分组计算均值,并保留3位小数
group_statistics <- aggregate(. ~ Species, data = iris, FUN = function(x) {
round(mean(x), 3)
})
# 打印分组均值
print(group_statistics) Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1 setosa 5.006 3.428 1.462 0.246
2 versicolor 5.936 2.770 4.260 1.326
3 virginica 6.588 2.974 5.552 2.026
1.2 绘制分组箱线图
- User提问:
按Species分组,绘制各变量的箱线图。
# 加载iris数据集
data(iris)
# 按Species分组绘制箱线图
par(mfrow = c(2, 2)) # 设置图形布局为2行2列
# 绘制Sepal.Length的箱线图
boxplot(Sepal.Length ~ Species, data = iris,
main = "Sepal Length", xlab = "Species", ylab = "Sepal Length")
# 绘制Sepal.Width的箱线图
boxplot(Sepal.Width ~ Species, data = iris, main = "Sepal Width", xlab = "Species", ylab = "Sepal Width")
# 绘制Petal.Length的箱线图
boxplot(Petal.Length ~ Species, data = iris, main = "Petal Length", xlab = "Species", ylab = "Petal Length")
# 绘制Petal.Width的箱线图
boxplot(Petal.Width ~ Species, data = iris, main = "Petal Width", xlab = "Species", ylab = "Petal Width")- User提问:
用ggplot2绘制按Species分组后,各变量的箱线图。
# 加载所需的库
library(ggplot2)
# 加载iris数据集
data(iris)
# 按Species分组绘制箱线图
ggplot(iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
geom_boxplot() +
labs(title = "Sepal Length", x = "Species", y = "Sepal Length")ggplot(iris, aes(x = Species, y = Sepal.Width, fill = Species)) +
geom_boxplot() +
labs(title = "Sepal Width", x = "Species", y = "Sepal Width")ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
geom_boxplot() +
labs(title = "Petal Length", x = "Species", y = "Petal Length")ggplot(iris, aes(x = Species, y = Petal.Width, fill = Species)) +
geom_boxplot() +
labs(title = "Petal Width", x = "Species", y = "Petal Width")- User提问:让上述4个图形显示在同一窗口。
# 加载所需的库
library(ggplot2)
# 加载iris数据集
data(iris)
# 创建绘图对象,按Species分组绘制箱线图
p <- ggplot(iris, aes(x = Species, fill = Species)) +
labs(x = "Species") +
theme_minimal() +
theme(legend.position = "none")
# 添加Sepal.Length的箱线图
p1 <- p +
geom_boxplot(aes(y = Sepal.Length)) +
labs(title = "Sepal Length")
# 添加Sepal.Width的箱线图
p2 <- p +
geom_boxplot(aes(y = Sepal.Width)) +
labs(title = "Sepal Width")
# 添加Petal.Length的箱线图
p3 <- p +
geom_boxplot(aes(y = Petal.Length)) +
labs(title = "Petal Length")
# 添加Petal.Width的箱线图
p4 <- p +
geom_boxplot(aes(y = Petal.Width)) +
labs(title = "Petal Width")
# 在同一窗口中显示四个箱线图
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)- User提问: Error in loadNamespace(x) :
there is no package called ‘gridExtra’
1.3 方差分析
- User提问:
将iris按照Species分组,检验三个组别的Sepal.length的均值是否相等?
# 加载所需的库
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(ggplot2)
# 加载iris数据集
data(iris)
# 进行方差分析
anova_result <- aov(Sepal.Length ~ Species, data = iris)
# 检验均值是否相等
anova_summary <- summary(anova_result)
# 提取p-value
p_value <- anova_summary[[1]]$"Pr(>F)"[1]
# 输出方差分析结果
print(anova_summary) Df Sum Sq Mean Sq F value Pr(>F)
Species 2 63.21 31.606 119.3 <2e-16 ***
Residuals 147 38.96 0.265
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n")cat("p-value:", p_value)p-value: 1.669669e-31
- User提问:
该数据满足ANOVA检验的适用条件吗?
- User提问:
iris数据集每个组别内的Sepal.Length观测值是否近似正态分布?
# 加载所需的库
library(dplyr)
library(ggplot2)
# 加载iris数据集
data(iris)
# 按Species分组,进行正态性检验
normality_test <- iris %>%
group_by(Species) %>%
summarise(p_value = shapiro.test(Sepal.Length)$p.value)
# 输出正态性检验结果
print(normality_test)# A tibble: 3 × 2
Species p_value
<fct> <dbl>
1 setosa 0.460
2 versicolor 0.465
3 virginica 0.258
- User提问: R 中进行方差齐性检验的函数是什么?
- User提问: bartlett.test()函数的结果如何看?
bartlett.test(iris$Sepal.Length,iris$Species)
Bartlett test of homogeneity of variances
data: iris$Sepal.Length and iris$Species
Bartlett's K-squared = 16.006, df = 2, p-value = 0.0003345
- User提问: iris数据集不同组别之间的Sepal.Length观测值方差不等,还能用ANOVA检验吗?
- User提问: Welch’s ANOVA的命令是什么?
# 导入iris数据集
data(iris)
# 执行Welch's ANOVA
result <- oneway.test(Sepal.Length ~ Species, data = iris)
# 打印结果
print(result)
One-way analysis of means (not assuming equal variances)
data: Sepal.Length and Species
F = 138.91, num df = 2.000, denom df = 92.211, p-value < 2.2e-16
1.4 判别分析
- User提问:基于iris数据,建立Species的判别函数?
- User提问: could not find function “lda”
# Install and load the MASS package
#install.packages("MASS")
library(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
# Load the iris dataset
data(iris)
# Perform LDA
lda_model <- lda(Species ~ ., data = iris)
# Print the discriminant function results
print(lda_model)Call:
lda(Species ~ ., data = iris)
Prior probabilities of groups:
setosa versicolor virginica
0.3333333 0.3333333 0.3333333
Group means:
Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa 5.006 3.428 1.462 0.246
versicolor 5.936 2.770 4.260 1.326
virginica 6.588 2.974 5.552 2.026
Coefficients of linear discriminants:
LD1 LD2
Sepal.Length 0.8293776 -0.02410215
Sepal.Width 1.5344731 -2.16452123
Petal.Length -2.2012117 0.93192121
Petal.Width -2.8104603 -2.83918785
Proportion of trace:
LD1 LD2
0.9912 0.0088
- User提问:输出结果如何解读?
-
User提问:如何利用判别函数判定一朵鸢尾花属于哪个组别?
# 用于预测的鸢尾花特征向量
new_iris <- data.frame(Sepal.Length = 5.1, Sepal.Width = 3.5, Petal.Length = 1.4, Petal.Width = 0.2)
# 使用判别函数进行预测
predicted_group <- predict(lda_model, newdata = new_iris)
# 输出预测结果
print(predicted_group$class)[1] setosa
Levels: setosa versicolor virginica
1.5 向ChatGPT提问技巧
如何在R中实现XX计算/检验/方法?请用R自带的数据集举例说明。
用XX包在R中实现XX计算/检验?请用R自带的数据集举例说明。
- 指定包的名称,例如ggplot2
XX计算/检验/方法的适用条件是什么?
在R中如何考察XX方法/检验的适用条件是否满足?
如何解读XX函数的输出结果的含义?
R中的XX分析有哪些可视化工具?
2 如何借助ChatGPT开展统计分析?
2.1 描述数据结构和分析目标
- User提问: 我有实验组和对照组各10人的血压数据,如何判断两个组别的血压是否有显著差距?
- User提问: 我的两个组别分别只有10人,能用独立样本t检验吗?
2.2 统计方法的选择:样本容量、数据类型、分布特征
- User提问: 还有别的方法吗?
- User提问: Wilcoxon Signed-Rank Test的R代码是什么?
-
User提问:我有控制组和对照组各10人的跟踪观测服药7天,14天和21天的血压数据,可以开展什么统计分析,来判断药物是否有效?
2.3 请求chatGPT创建模拟数据
- User提问:请在R中生成上述模拟数据,及实现Repeated Measures ANOVA的代码?
# 生成模拟数据
set.seed(123) # 设置随机种子以保证结果的可重复性
# 创建数据框
data <- data.frame(
Subject = rep(1:10, each = 3), # 受试者编号,重复测量所以有10个受试者
Time = rep(c("Day 7", "Day 14", "Day 21"), times = 10), # 时间点
Group = rep(c("Control", "Treatment"), each = 15), # 组别
BloodPressure = rnorm(30, mean = c(120, 130, 125), sd = c(5, 7, 6)) # 血压数据
)
# 查看数据
print(data) Subject Time Group BloodPressure
1 1 Day 7 Control 117.1976
2 1 Day 14 Control 128.3888
3 1 Day 21 Control 134.3522
4 2 Day 7 Control 120.3525
5 2 Day 14 Control 130.9050
6 2 Day 21 Control 135.2904
7 3 Day 7 Control 122.3046
8 3 Day 14 Control 121.1446
9 3 Day 21 Control 120.8789
10 4 Day 7 Control 117.7717
11 4 Day 14 Control 138.5686
12 4 Day 21 Control 127.1589
13 5 Day 7 Control 122.0039
14 5 Day 14 Control 130.7748
15 5 Day 21 Control 121.6650
16 6 Day 7 Treatment 128.9346
17 6 Day 14 Treatment 133.4850
18 6 Day 21 Treatment 113.2003
19 7 Day 7 Treatment 123.5068
20 7 Day 14 Treatment 126.6905
21 7 Day 21 Treatment 118.5931
22 8 Day 7 Treatment 118.9101
23 8 Day 14 Treatment 122.8180
24 8 Day 21 Treatment 120.6267
25 9 Day 7 Treatment 116.8748
26 9 Day 14 Treatment 118.1931
27 9 Day 21 Treatment 130.0267
28 10 Day 7 Treatment 120.7669
29 10 Day 14 Treatment 122.0330
30 10 Day 21 Treatment 132.5229
# 进行重复测量方差分析(Repeated Measures ANOVA)
library(nlme)
Attaching package: 'nlme'
The following object is masked from 'package:dplyr':
collapse
# 设置数据为重复测量设计
data$Subject <- factor(data$Subject)
data$Time <- factor(data$Time)
# 执行重复测量方差分析
model <- lme(BloodPressure ~ Time * Group, data = data, random = ~ 1 | Subject)
anova_result <- anova(model)
# 打印重复测量方差分析结果
print(anova_result) numDF denDF F-value p-value
(Intercept) 1 16 13201.338 <.0001
Time 2 16 3.112 0.0721
Group 1 8 1.635 0.2369
Time:Group 2 16 1.151 0.3412
2.4 解读输出结果
- User提问: 解释结果 > # 打印重复测量方差分析结果
print(anova_result)
numDF denDF F-value p-value
(Intercept) 1 16 13201.338 <.0001
Time 2 16 3.112 0.0721
Group 1 8 1.635 0.2369
Time:Group 2 16 1.151 0.3412
3 如何学习ChatGPT生成的代码?
3.1 请ChatGPT给代码添加注释
3.2 请chatGPT解释函数的用法、参数项
3.3 代码报错的应对
3.4 如何模仿ChatGPT生成的代码?
# 导入所需的包
library(e1071)
library(dplyr)
# 划分训练集和测试集
set.seed(123)
train_indices <- sample(1:nrow(iris), nrow(iris) * 0.7)
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]
# 创建SVM模型
svm_model <- svm(Species ~ ., data = train_data)
# 预测测试集
predicted_labels <- predict(svm_model, test_data[, -5])
# 计算预测准确率
accuracy <- sum(predicted_labels == test_data$Species) / nrow(test_data)
cat("预测准确率:", accuracy, "\n")预测准确率: 0.9777778
# 创建结果列表
result_list <- data.frame(
Individual = 1:nrow(test_data),
Actual = test_data$Species,
Predicted = predicted_labels,
Correct = ifelse(test_data$Species == predicted_labels, "Correct", "Incorrect")
)
# 打印结果列表
print(result_list) Individual Actual Predicted Correct
1 1 setosa setosa Correct
2 2 setosa setosa Correct
3 3 setosa setosa Correct
5 4 setosa setosa Correct
11 5 setosa setosa Correct
18 6 setosa setosa Correct
19 7 setosa setosa Correct
28 8 setosa setosa Correct
29 9 setosa setosa Correct
33 10 setosa setosa Correct
36 11 setosa setosa Correct
45 12 setosa setosa Correct
48 13 setosa setosa Correct
49 14 setosa setosa Correct
55 15 versicolor versicolor Correct
56 16 versicolor versicolor Correct
57 17 versicolor versicolor Correct
58 18 versicolor versicolor Correct
59 19 versicolor versicolor Correct
61 20 versicolor versicolor Correct
62 21 versicolor versicolor Correct
65 22 versicolor versicolor Correct
66 23 versicolor versicolor Correct
68 24 versicolor versicolor Correct
70 25 versicolor versicolor Correct
77 26 versicolor versicolor Correct
83 27 versicolor versicolor Correct
84 28 versicolor virginica Incorrect
94 29 versicolor versicolor Correct
95 30 versicolor versicolor Correct
98 31 versicolor versicolor Correct
100 32 versicolor versicolor Correct
101 33 virginica virginica Correct
104 34 virginica virginica Correct
105 35 virginica virginica Correct
111 36 virginica virginica Correct
113 37 virginica virginica Correct
116 38 virginica virginica Correct
125 39 virginica virginica Correct
131 40 virginica virginica Correct
133 41 virginica virginica Correct
135 42 virginica virginica Correct
140 43 virginica virginica Correct
141 44 virginica virginica Correct
145 45 virginica virginica Correct
set.seed(123) # 设置随机种子以确保可重复性
# 生成500个糖尿病患者数据
diabetes_data <- data.frame(
Age = round(rnorm(500, mean = 50, sd = 10)),
BMI = round(rnorm(500, mean = 28, sd = 5)),
Glucose = round(rnorm(500, mean = 120, sd = 30)),
Insulin = round(rnorm(500, mean = 10, sd = 5)),
Exercise = sample(c("None", "Light", "Moderate", "Heavy"), 500, replace = TRUE),
Diet = sample(c("Healthy", "Unhealthy"), 500, replace = TRUE),
Waist = round(rnorm(500, mean = 90, sd = 10)),
Alcohol = sample(c("No", "Occasional", "Regular"), 500, replace = TRUE),
Smoking = sample(c("No", "Occasional", "Regular"), 500, replace = TRUE),
Type = sample(c("Type 1", "Type 2", "Gestational"), 500, replace = TRUE)
)
# 查看更新后的数据集
head(diabetes_data) Age BMI Glucose Insulin Exercise Diet Waist Alcohol Smoking
1 44 25 90 6 Light Healthy 83 No Regular
2 48 23 89 8 Heavy Healthy 96 Occasional Regular
3 66 33 119 5 Light Unhealthy 83 No Regular
4 51 32 116 13 None Healthy 85 Occasional Regular
5 51 20 44 16 Moderate Unhealthy 98 Occasional No
6 67 28 151 21 Moderate Healthy 85 Occasional Occasional
Type
1 Type 2
2 Gestational
3 Type 1
4 Type 2
5 Gestational
6 Gestational
# 划分训练集和测试集
set.seed(123)
train_indices <- sample(1:nrow(diabetes_data), nrow(diabetes_data) * 0.7)
train_data <- diabetes_data[train_indices, ]
test_data <- diabetes_data[-train_indices, ]
# 将目标变量转换为因子类型
train_data$Type <- as.factor(train_data$Type)
test_data$Type <- as.factor(test_data$Type)
# 创建并训练支持向量机模型
svm_model <- svm(Type ~ ., data = train_data, kernel = "radial", cost = 1)
# 在测试集上进行预测
predicted_labels <- predict(svm_model, newdata = test_data)
# 计算预测准确率
accuracy <- sum(predicted_labels == test_data$Type) / nrow(test_data) * 100
cat("预测准确率:", round(accuracy, 2), "%\n")预测准确率: 34 %
# 创建包含预测结果的表格
prediction_table <- data.frame(
Actual = test_data$Type,
Predicted = predicted_labels,
Correct = ifelse(test_data$Type == predicted_labels, "Yes", "No")
)
# 查看预测结果表格
head(prediction_table) Actual Predicted Correct
1 Type 2 Gestational No
3 Type 1 Gestational No
6 Gestational Gestational Yes
8 Type 2 Gestational No
9 Type 1 Gestational No
12 Gestational Type 1 No